package org.apache.solr.schema;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.schema.PreAnalyzedField.ParseResult;
import org.apache.solr.schema.PreAnalyzedField.PreAnalyzedParser;

/**
 * Simple plain text format parser for {@link PreAnalyzedField}. <h2>
 * Serialization format</h2>
 * <p>
 * The format of the serialization is as follows:
 * 
 * <pre>
 * content ::= version (stored)? tokens
 * version ::= digit+ " "
 * ; stored field value - any "=" inside must be escaped!
 * stored ::= "=" text "="
 * tokens ::= (token ((" ") + token)*)*
 * token ::= text ("," attrib)*
 * attrib ::= name '=' value
 * name ::= text
 * value ::= text
 * </pre>
 * <p>
 * Special characters in "text" values can be escaped using the escape character
 * \ . The following escape sequences are recognized:
 * 
 * <pre>
 * "\ " - literal space character
 * "\," - literal , character
 * "\=" - literal = character
 * "\\" - literal \ character
 * "\n" - newline
 * "\r" - carriage return
 * "\t" - horizontal tab
 * </pre>
 * 
 * Please note that Unicode sequences (e.g. &#92;u0001) are not supported.
 * <h2>Supported attribute names</h2>
 * The following token attributes are supported, and identified with short
 * symbolic names:
 * 
 * <pre>
 * i - position increment (integer)
 * s - token offset, start position (integer)
 * e - token offset, end position (integer)
 * t - token type (string)
 * f - token flags (hexadecimal integer)
 * p - payload (bytes in hexadecimal format)
 * </pre>
 * 
 * Token positions are tracked and implicitly added to the token stream - the
 * start and end offsets consider only the term text and whitespace, and exclude
 * the space taken by token attributes.
 * <h2>Example token streams</h2>
 * 
 * <pre>
 * 1 one two three
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=one,startOffset=0,endOffset=3)'
 *   - tok: '(term=two,startOffset=4,endOffset=7)'
 *   - tok: '(term=three,startOffset=8,endOffset=13)'
 *  1 one  two   three 
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=one,startOffset=1,endOffset=4)'
 *   - tok: '(term=two,startOffset=6,endOffset=9)'
 *   - tok: '(term=three,startOffset=12,endOffset=17)'
 * 1 one,s=123,e=128,i=22  two three,s=20,e=22
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=one,positionIncrement=22,startOffset=123,endOffset=128)'
 *   - tok: '(term=two,positionIncrement=1,startOffset=5,endOffset=8)'
 *   - tok: '(term=three,positionIncrement=1,startOffset=20,endOffset=22)'
 * 1 \ one\ \,,i=22,a=\, two\=
 * 
 *   \n,\ =\   \
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term= one ,,positionIncrement=22,startOffset=0,endOffset=6)'
 *   - tok: '(term=two=
 * 
 *   
 *  ,positionIncrement=1,startOffset=7,endOffset=15)'
 *   - tok: '(term=\,positionIncrement=1,startOffset=17,endOffset=18)'
 * 1 ,i=22 ,i=33,s=2,e=20 , 
 *   - version 1
 *   - stored: 'null'
 *   - tok: '(term=,positionIncrement=22,startOffset=0,endOffset=0)'
 *   - tok: '(term=,positionIncrement=33,startOffset=2,endOffset=20)'
 *   - tok: '(term=,positionIncrement=1,startOffset=2,endOffset=2)'
 * 1 =This is the stored part with \= 
 *  \n    \t escapes.=one two three 
 *   - version 1
 *   - stored: 'This is the stored part with = 
 *  \n    \t escapes.'
 *   - tok: '(term=one,startOffset=0,endOffset=3)'
 *   - tok: '(term=two,startOffset=4,endOffset=7)'
 *   - tok: '(term=three,startOffset=8,endOffset=13)'
 * 1 ==
 *   - version 1
 *   - stored: ''
 *   - (no tokens)
 * 1 =this is a test.=
 *   - version 1
 *   - stored: 'this is a test.'
 *   - (no tokens)
 * </pre>
 */
public final class SimplePreAnalyzedParser implements PreAnalyzedParser {
	static final String VERSION = "1";

	private static class Tok {
		StringBuilder token = new StringBuilder();
		Map<String, String> attr = new HashMap<String, String>();

		public boolean isEmpty() {
			return token.length() == 0 && attr.size() == 0;
		}

		public void reset() {
			token.setLength(0);
			attr.clear();
		}

		public String toString() {
			return "tok='" + token + "',attr=" + attr;
		}
	}

	// parser state
	private static enum S {
		TOKEN, NAME, VALUE, UNDEF
	};

	private static final byte[] EMPTY_BYTES = new byte[0];

	/** Utility method to convert byte array to a hex string. */
	static byte[] hexToBytes(String hex) {
		if (hex == null) {
			return EMPTY_BYTES;
		}
		hex = hex.replaceAll("\\s+", "");
		if (hex.length() == 0) {
			return EMPTY_BYTES;
		}
		ByteArrayOutputStream baos = new ByteArrayOutputStream(hex.length() / 2);
		byte b;
		for (int i = 0; i < hex.length(); i++) {
			int high = charToNibble(hex.charAt(i));
			int low = 0;
			if (i < hex.length() - 1) {
				i++;
				low = charToNibble(hex.charAt(i));
			}
			b = (byte) (high << 4 | low);
			baos.write(b);
		}
		return baos.toByteArray();
	}

	static final int charToNibble(char c) {
		if (c >= '0' && c <= '9') {
			return c - '0';
		} else if (c >= 'a' && c <= 'f') {
			return 0xa + (c - 'a');
		} else if (c >= 'A' && c <= 'F') {
			return 0xA + (c - 'A');
		} else {
			throw new RuntimeException("Not a hex character: '" + c + "'");
		}
	}

	static String bytesToHex(byte bytes[], int offset, int length) {
		StringBuilder sb = new StringBuilder();
		for (int i = offset; i < offset + length; ++i) {
			sb.append(Integer.toHexString(0x0100 + (bytes[i] & 0x00FF))
					.substring(1));
		}
		return sb.toString();
	}

	public SimplePreAnalyzedParser() {

	}

	@Override
	public ParseResult parse(Reader reader, AttributeSource parent)
			throws IOException {
		ParseResult res = new ParseResult();
		StringBuilder sb = new StringBuilder();
		char[] buf = new char[128];
		int cnt;
		while ((cnt = reader.read(buf)) > 0) {
			sb.append(buf, 0, cnt);
		}
		String val = sb.toString();
		// empty string - accept even without version number
		if (val.length() == 0) {
			return res;
		}
		// first consume the version
		int idx = val.indexOf(' ');
		if (idx == -1) {
			throw new IOException("Missing VERSION token");
		}
		String version = val.substring(0, idx);
		if (!VERSION.equals(version)) {
			throw new IOException("Unknown VERSION " + version);
		}
		val = val.substring(idx + 1);
		// then consume the optional stored part
		int tsStart = 0;
		boolean hasStored = false;
		StringBuilder storedBuf = new StringBuilder();
		if (val.charAt(0) == '=') {
			hasStored = true;
			if (val.length() > 1) {
				for (int i = 1; i < val.length(); i++) {
					char c = val.charAt(i);
					if (c == '\\') {
						if (i < val.length() - 1) {
							c = val.charAt(++i);
							if (c == '=') { // we recognize only \= escape in
											// the stored part
								storedBuf.append('=');
							} else {
								storedBuf.append('\\');
								storedBuf.append(c);
								continue;
							}
						} else {
							storedBuf.append(c);
							continue;
						}
					} else if (c == '=') {
						// end of stored text
						tsStart = i + 1;
						break;
					} else {
						storedBuf.append(c);
					}
				}
				if (tsStart == 0) { // missing end-of-stored marker
					throw new IOException("Missing end marker of stored part");
				}
			} else {
				throw new IOException("Unexpected end of stored field");
			}
		}
		if (hasStored) {
			res.str = storedBuf.toString();
		}
		Tok tok = new Tok();
		StringBuilder attName = new StringBuilder();
		StringBuilder attVal = new StringBuilder();
		// parser state
		S s = S.UNDEF;
		int lastPos = 0;
		for (int i = tsStart; i < val.length(); i++) {
			char c = val.charAt(i);
			if (c == ' ') {
				// collect leftovers
				switch (s) {
				case VALUE:
					if (attVal.length() == 0) {
						throw new IOException("Unexpected character '" + c
								+ "' at position " + i
								+ " - empty value of attribute.");
					}
					if (attName.length() > 0) {
						tok.attr.put(attName.toString(), attVal.toString());
					}
					break;
				case NAME: // attr name without a value ?
					if (attName.length() > 0) {
						throw new IOException("Unexpected character '" + c
								+ "' at position " + i
								+ " - missing attribute value.");
					} else {
						// accept missing att name and value
					}
					break;
				case TOKEN:
				case UNDEF:
					// do nothing, advance to next token
				}
				attName.setLength(0);
				attVal.setLength(0);
				if (!tok.isEmpty() || s == S.NAME) {
					AttributeSource.State state = createState(parent, tok,
							lastPos);
					if (state != null)
						res.states.add(state.clone());
				}
				// reset tok
				s = S.UNDEF;
				tok.reset();
				// skip
				lastPos++;
				continue;
			}
			StringBuilder tgt = null;
			switch (s) {
			case TOKEN:
				tgt = tok.token;
				break;
			case NAME:
				tgt = attName;
				break;
			case VALUE:
				tgt = attVal;
				break;
			case UNDEF:
				tgt = tok.token;
				s = S.TOKEN;
			}
			if (c == '\\') {
				if (s == S.TOKEN)
					lastPos++;
				if (i >= val.length() - 1) { // end

					tgt.append(c);
					continue;
				} else {
					c = val.charAt(++i);
					switch (c) {
					case '\\':
					case '=':
					case ',':
					case ' ':
						tgt.append(c);
						break;
					case 'n':
						tgt.append('\n');
						break;
					case 'r':
						tgt.append('\r');
						break;
					case 't':
						tgt.append('\t');
						break;
					default:
						tgt.append('\\');
						tgt.append(c);
						lastPos++;
					}
				}
			} else {
				// state switch
				if (c == ',') {
					if (s == S.TOKEN) {
						s = S.NAME;
					} else if (s == S.VALUE) { // end of value, start of next
												// attr
						if (attVal.length() == 0) {
							throw new IOException("Unexpected character '" + c
									+ "' at position " + i
									+ " - empty value of attribute.");
						}
						if (attName.length() > 0 && attVal.length() > 0) {
							tok.attr.put(attName.toString(), attVal.toString());
						}
						// reset
						attName.setLength(0);
						attVal.setLength(0);
						s = S.NAME;
					} else {
						throw new IOException("Unexpected character '" + c
								+ "' at position " + i
								+ " - missing attribute value.");
					}
				} else if (c == '=') {
					if (s == S.NAME) {
						s = S.VALUE;
					} else {
						throw new IOException("Unexpected character '" + c
								+ "' at position " + i
								+ " - empty value of attribute.");
					}
				} else {
					tgt.append(c);
					if (s == S.TOKEN)
						lastPos++;
				}
			}
		}
		// collect leftovers
		if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
			// remaining attrib?
			if (s == S.VALUE) {
				if (attName.length() > 0 && attVal.length() > 0) {
					tok.attr.put(attName.toString(), attVal.toString());
				}
			}
			AttributeSource.State state = createState(parent, tok, lastPos);
			if (state != null)
				res.states.add(state.clone());
		}
		return res;
	}

	private static AttributeSource.State createState(AttributeSource a,
			Tok state, int tokenEnd) {
		a.clearAttributes();
		CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
		char[] tokChars = state.token.toString().toCharArray();
		termAtt.copyBuffer(tokChars, 0, tokChars.length);
		int tokenStart = tokenEnd - state.token.length();
		for (Entry<String, String> e : state.attr.entrySet()) {
			String k = e.getKey();
			if (k.equals("i")) {
				// position increment
				int incr = Integer.parseInt(e.getValue());
				PositionIncrementAttribute posIncr = a
						.addAttribute(PositionIncrementAttribute.class);
				posIncr.setPositionIncrement(incr);
			} else if (k.equals("s")) {
				tokenStart = Integer.parseInt(e.getValue());
			} else if (k.equals("e")) {
				tokenEnd = Integer.parseInt(e.getValue());
			} else if (k.equals("y")) {
				TypeAttribute type = a.addAttribute(TypeAttribute.class);
				type.setType(e.getValue());
			} else if (k.equals("f")) {
				FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
				int f = Integer.parseInt(e.getValue(), 16);
				flags.setFlags(f);
			} else if (k.equals("p")) {
				PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
				byte[] data = hexToBytes(e.getValue());
				if (data != null && data.length > 0) {
					p.setPayload(new BytesRef(data));
				}
			} else {
				// unknown attribute
			}
		}
		// handle offset attr
		OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
		offset.setOffset(tokenStart, tokenEnd);
		State resState = a.captureState();
		a.clearAttributes();
		return resState;
	}

	public String toFormattedString(Field f) throws IOException {
		StringBuilder sb = new StringBuilder();
		sb.append(VERSION + " ");
		if (f.fieldType().stored()) {
			String s = f.stringValue();
			if (s != null) {
				// encode the equals sign
				s = s.replaceAll("=", "\\=");
				sb.append('=');
				sb.append(s);
				sb.append('=');
			}
		}
		TokenStream ts = f.tokenStreamValue();
		if (ts != null) {
			StringBuilder tok = new StringBuilder();
			boolean next = false;
			while (ts.incrementToken()) {
				if (next) {
					sb.append(' ');
				} else {
					next = true;
				}
				tok.setLength(0);
				Iterator<Class<? extends Attribute>> it = ts
						.getAttributeClassesIterator();
				String cTerm = null;
				String tTerm = null;
				while (it.hasNext()) {
					Class<? extends Attribute> cl = it.next();
					if (!ts.hasAttribute(cl)) {
						continue;
					}
					Attribute att = ts.getAttribute(cl);
					if (cl.isAssignableFrom(CharTermAttribute.class)) {
						CharTermAttribute catt = (CharTermAttribute) att;
						cTerm = escape(catt.buffer(), catt.length());
					} else if (cl
							.isAssignableFrom(TermToBytesRefAttribute.class)) {
						TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
						char[] tTermChars = tatt.getBytesRef().utf8ToString()
								.toCharArray();
						tTerm = escape(tTermChars, tTermChars.length);
					} else {
						if (tok.length() > 0)
							tok.append(',');
						if (cl.isAssignableFrom(FlagsAttribute.class)) {
							tok.append("f="
									+ Integer
											.toHexString(((FlagsAttribute) att)
													.getFlags()));
						} else if (cl.isAssignableFrom(OffsetAttribute.class)) {
							tok.append("s="
									+ ((OffsetAttribute) att).startOffset()
									+ ",e="
									+ ((OffsetAttribute) att).endOffset());
						} else if (cl.isAssignableFrom(PayloadAttribute.class)) {
							BytesRef p = ((PayloadAttribute) att).getPayload();
							if (p != null && p.length > 0) {
								tok.append("p="
										+ bytesToHex(p.bytes, p.offset,
												p.length));
							} else if (tok.length() > 0) {
								tok.setLength(tok.length() - 1); // remove the
																	// last
																	// comma
							}
						} else if (cl
								.isAssignableFrom(PositionIncrementAttribute.class)) {
							tok.append("i="
									+ ((PositionIncrementAttribute) att)
											.getPositionIncrement());
						} else if (cl.isAssignableFrom(TypeAttribute.class)) {
							tok.append("y="
									+ escape(((TypeAttribute) att).type()));
						} else {

							tok.append(cl.getName() + "="
									+ escape(att.toString()));
						}
					}
				}
				String term = null;
				if (cTerm != null) {
					term = cTerm;
				} else {
					term = tTerm;
				}
				if (term != null && term.length() > 0) {
					if (tok.length() > 0) {
						tok.insert(0, term + ",");
					} else {
						tok.insert(0, term);
					}
				}
				sb.append(tok);
			}
		}
		return sb.toString();
	}

	String escape(String val) {
		return escape(val.toCharArray(), val.length());
	}

	String escape(char[] val, int len) {
		if (val == null || len == 0) {
			return "";
		}
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < len; i++) {
			switch (val[i]) {
			case '\\':
			case '=':
			case ',':
			case ' ':
				sb.append('\\');
				sb.append(val[i]);
				break;
			case '\n':
				sb.append('\\');
				sb.append('n');
				break;
			case '\r':
				sb.append('\\');
				sb.append('r');
				break;
			case '\t':
				sb.append('\\');
				sb.append('t');
				break;
			default:
				sb.append(val[i]);
			}
		}
		return sb.toString();
	}

}
